About the project

I expect to learn how to use RStudio with large datasets. In addition, I want to learn how to visualize data properly.

Link: https://github.com/swinesci/IODS-project

I have a problem with chapter 2

Write a short description about the course and add a link to your GitHub repository here. This is an R Markdown (.Rmd) file so you can use R Markdown syntax.


Exercise 2

to read the data

df <- read.table("http://s3.amazonaws.com/assets.datacamp.com/production/course_2218/datasets/learning2014.txt", sep=",", header=TRUE)

to see the structure and dimensions

str(df)
## 'data.frame':    166 obs. of  7 variables:
##  $ gender  : Factor w/ 2 levels "F","M": 1 2 1 2 2 1 2 1 2 1 ...
##  $ age     : int  53 55 49 53 49 38 50 37 37 42 ...
##  $ attitude: num  3.7 3.1 2.5 3.5 3.7 3.8 3.5 2.9 3.8 2.1 ...
##  $ deep    : num  3.58 2.92 3.5 3.5 3.67 ...
##  $ stra    : num  3.38 2.75 3.62 3.12 3.62 ...
##  $ surf    : num  2.58 3.17 2.25 2.25 2.83 ...
##  $ points  : int  25 12 24 10 22 21 21 31 24 26 ...
dim(df)
## [1] 166   7

to access the ggplot2 library

library(ggplot2)
ggplot(df)

pairs(df[-1])

## to access the GGally and ggplot2

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ggplot2)

to see summary

summary(df)
##  gender       age           attitude          deep            stra      
##  F:110   Min.   :17.00   Min.   :1.400   Min.   :1.583   Min.   :1.250  
##  M: 56   1st Qu.:21.00   1st Qu.:2.600   1st Qu.:3.333   1st Qu.:2.625  
##          Median :22.00   Median :3.200   Median :3.667   Median :3.188  
##          Mean   :25.51   Mean   :3.143   Mean   :3.680   Mean   :3.121  
##          3rd Qu.:27.00   3rd Qu.:3.700   3rd Qu.:4.083   3rd Qu.:3.625  
##          Max.   :55.00   Max.   :5.000   Max.   :4.917   Max.   :5.000  
##       surf           points     
##  Min.   :1.583   Min.   : 7.00  
##  1st Qu.:2.417   1st Qu.:19.00  
##  Median :2.833   Median :23.00  
##  Mean   :2.787   Mean   :22.72  
##  3rd Qu.:3.167   3rd Qu.:27.75  
##  Max.   :4.333   Max.   :33.00

create a more advanced plot matrix with ggpairs

# Advanced plot matrix with ggpairs, coloured by gender.
# The argument name is 'mapping' — the original typo 'maaping' was silently
# ignored by ggpairs (see its warning), so colour and alpha were never applied.
p <- ggpairs(df, mapping = aes(col = gender, alpha = 0.3), lower = list(combo = wrap("facethist", bins = 20)))

to draw the plot

p

## The three explanatory variables are “attitude”, “stra” (strategic learning) and “surf” (surface learning). These were chosen based on their correlation (r) values with points.

to see scatter plot

qplot(attitude, points, data = df) + geom_smooth(method = "lm")

## fit a linear model

# NOTE(review): this fits an intercept-only model (points ~ 1), i.e. it only
# estimates the mean of 'points' — no explanatory variable. Given the scatter
# plot of attitude vs. points just above, the intended model may have been
# points ~ attitude; confirm. The printed summary below does match the
# intercept-only fit (single coefficient, mean = 22.72).
model <- lm(points ~1, data = df)

summary of the model

summary(model)
## 
## Call:
## lm(formula = points ~ 1, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.7169  -3.7169   0.2831   5.0331  10.2831 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  22.7169     0.4575   49.65   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.895 on 165 degrees of freedom

to create a plot matrix

library(GGally)
ggpairs(df, lower = list(combo = wrap("facethist", bins = 20)))

to create a regression model with multiple explanatory variables

model2<- lm(points ~ attitude + stra + surf, data = df)
summary(model2)
## 
## Call:
## lm(formula = points ~ attitude + stra + surf, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.1550  -3.4346   0.5156   3.6401  10.8952 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  11.0171     3.6837   2.991  0.00322 ** 
## attitude      3.3952     0.5741   5.913 1.93e-08 ***
## stra          0.8531     0.5416   1.575  0.11716    
## surf         -0.5861     0.8014  -0.731  0.46563    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.296 on 162 degrees of freedom
## Multiple R-squared:  0.2074, Adjusted R-squared:  0.1927 
## F-statistic: 14.13 on 3 and 162 DF,  p-value: 3.156e-08

remove surf because its p-value was the highest

model3<- lm(points ~attitude + stra, data=df)
summary(model3)
## 
## Call:
## lm(formula = points ~ attitude + stra, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.6436  -3.3113   0.5575   3.7928  10.9295 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.9729     2.3959   3.745  0.00025 ***
## attitude      3.4658     0.5652   6.132 6.31e-09 ***
## stra          0.9137     0.5345   1.709  0.08927 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.289 on 163 degrees of freedom
## Multiple R-squared:  0.2048, Adjusted R-squared:  0.1951 
## F-statistic: 20.99 on 2 and 163 DF,  p-value: 7.734e-09

The most significant variable was attitude; stra showed only a non-significant tendency (p ≈ 0.09).

to draw diagnostic plots

par(mfrow = c(2,2))
plot(model3, which = c(1,2,5))

The QQ plot showed that the residuals are approximately normally distributed; all looks good.


Exercise 3

alc <- read.csv("Z:\\IODS-project\\data\\alc.csv", sep = ",")

access the tidyverse libraries tidyr, dplyr, ggplot2

library(tidyr); library(dplyr); library(ggplot2)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:GGally':
## 
##     nasa
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

glimpse at the alc data

glimpse(alc) 
## Observations: 382
## Variables: 36
## $ X          <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
## $ school     <fct> GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP,...
## $ sex        <fct> F, F, F, F, F, M, M, F, M, M, F, F, M, M, M, F, F, ...
## $ age        <int> 18, 17, 15, 15, 16, 16, 16, 17, 15, 15, 15, 15, 15,...
## $ address    <fct> U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, ...
## $ famsize    <fct> GT3, GT3, LE3, GT3, GT3, LE3, LE3, GT3, LE3, GT3, G...
## $ Pstatus    <fct> A, T, T, T, T, T, T, A, A, T, T, T, T, T, A, T, T, ...
## $ Medu       <int> 4, 1, 1, 4, 3, 4, 2, 4, 3, 3, 4, 2, 4, 4, 2, 4, 4, ...
## $ Fedu       <int> 4, 1, 1, 2, 3, 3, 2, 4, 2, 4, 4, 1, 4, 3, 2, 4, 4, ...
## $ Mjob       <fct> at_home, at_home, at_home, health, other, services,...
## $ Fjob       <fct> teacher, other, other, services, other, other, othe...
## $ reason     <fct> course, course, other, home, home, reputation, home...
## $ nursery    <fct> yes, no, yes, yes, yes, yes, yes, yes, yes, yes, ye...
## $ internet   <fct> no, yes, yes, yes, no, yes, yes, no, yes, yes, yes,...
## $ guardian   <fct> mother, father, mother, mother, father, mother, mot...
## $ traveltime <int> 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 2, 1, 1, 1, ...
## $ studytime  <int> 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 1, 2, 3, 1, 3, ...
## $ failures   <int> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ schoolsup  <fct> yes, no, yes, no, no, no, no, yes, no, no, no, no, ...
## $ famsup     <fct> no, yes, no, yes, yes, yes, no, yes, yes, yes, yes,...
## $ paid       <fct> no, no, yes, yes, yes, yes, no, no, yes, yes, yes, ...
## $ activities <fct> no, no, no, yes, no, yes, no, no, no, yes, no, yes,...
## $ higher     <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, y...
## $ romantic   <fct> no, no, no, yes, no, no, no, no, no, no, no, no, no...
## $ famrel     <int> 4, 5, 4, 3, 4, 5, 4, 4, 4, 5, 3, 5, 4, 5, 4, 4, 3, ...
## $ freetime   <int> 3, 3, 3, 2, 3, 4, 4, 1, 2, 5, 3, 2, 3, 4, 5, 4, 2, ...
## $ goout      <int> 4, 3, 2, 2, 2, 2, 4, 4, 2, 1, 3, 2, 3, 3, 2, 4, 3, ...
## $ Dalc       <int> 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ Walc       <int> 1, 1, 3, 1, 2, 2, 1, 1, 1, 1, 2, 1, 3, 2, 1, 2, 2, ...
## $ health     <int> 3, 3, 3, 5, 5, 5, 3, 1, 1, 5, 2, 4, 5, 3, 3, 2, 2, ...
## $ absences   <int> 5, 3, 8, 1, 2, 8, 0, 4, 0, 0, 1, 2, 1, 1, 0, 5, 8, ...
## $ G1         <int> 2, 7, 10, 14, 8, 14, 12, 8, 16, 13, 12, 10, 13, 11,...
## $ G2         <int> 8, 8, 10, 14, 12, 14, 12, 9, 17, 14, 11, 12, 14, 11...
## $ G3         <int> 8, 8, 11, 14, 12, 14, 12, 10, 18, 14, 12, 12, 13, 1...
## $ alc_use    <dbl> 1.0, 1.0, 2.5, 1.0, 1.5, 1.5, 1.0, 1.0, 1.0, 1.0, 1...
## $ high_use   <lgl> FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FAL...

use gather() to gather columns into key-value pairs and then glimpse() at the resulting data

gather(alc) %>% glimpse
## Warning: attributes are not identical across measure variables;
## they will be dropped
## Observations: 13,752
## Variables: 2
## $ key   <chr> "X", "X", "X", "X", "X", "X", "X", "X", "X", "X", "X", "...
## $ value <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11",...

draw a bar plot of each variable

gather(alc) %>% ggplot(aes(value)) + facet_wrap("key", scales = "free") + geom_bar()
## Warning: attributes are not identical across measure variables;
## they will be dropped

access the tidyverse libraries dplyr and ggplot2

library(dplyr)
library(ggplot2)

produce summary statistics by group

alc %>% group_by(sex) %>% summarise(count = n())
## # A tibble: 2 x 2
##   sex   count
##   <fct> <int>
## 1 F       198
## 2 M       184

library(ggplot2)

initialize a plot of high_use and G3

g1 <- ggplot(alc, aes(x = high_use, y = G3, col = sex))

define the plot as a boxplot and draw it

g1 + geom_boxplot() + ylab("grade")

initialise a plot of high_use and absences

g2 <- ggplot(alc, aes(x = high_use, y = absences, col = sex))

define the plot as a boxplot and draw it

g2 + geom_boxplot() + ggtitle("Student absences by alcohol consumption and sex")

find the model with glm()

m <- glm(high_use ~ failures + absences + sex, data = alc, family = "binomial")

alc and dplyr are available

find the model with glm()

m <- glm(high_use ~ failures + absences + sex, data = alc, family = "binomial")

compute odds ratios (OR)

OR <- coef(m) %>% exp

compute confidence intervals (CI)

CI <- confint(m) %>% exp
## Waiting for profiling to be done...

fit the model

m <- glm(high_use ~ failures + absences + sex, data = alc, family = "binomial")

predict() the probability of high_use

probabilities <- predict(m, type = "response")

add the predicted probabilities to ‘alc’

alc <- mutate(alc, probability = probabilities)

use the probabilities to make a prediction of high_use

alc <- mutate(alc, prediction = probability > 0.5)

see the last ten original classes, predicted probabilities, and class predictions

select(alc, failures, absences, sex, high_use, probability, prediction) %>% tail(10)
##     failures absences sex high_use probability prediction
## 373        1        0   M    FALSE   0.3749639      FALSE
## 374        1        7   M     TRUE   0.5353311       TRUE
## 375        0        1   F    FALSE   0.1406689      FALSE
## 376        0        6   F    FALSE   0.2069112      FALSE
## 377        1        2   F    FALSE   0.2199932      FALSE
## 378        0        2   F    FALSE   0.1523192      FALSE
## 379        2        2   F    FALSE   0.3068503      FALSE
## 380        0        3   F    FALSE   0.1647495      FALSE
## 381        0        4   M     TRUE   0.3568828      FALSE
## 382        0        2   M     TRUE   0.3153209      FALSE

tabulate the target variable versus the predictions

table(high_use = alc$high_use, prediction = alc$prediction)
##         prediction
## high_use FALSE TRUE
##    FALSE   259    9
##    TRUE     84   30

access dplyr and ggplot2

library(dplyr); library(ggplot2)

initialize a plot of ‘high_use’ versus ‘probability’ in ‘alc’

g <- ggplot(alc, aes(x = probability, y = high_use, col = prediction))

define the geom as points and draw the plot

g + geom_point()

tabulate the target variable versus the predictions

table(high_use = alc$high_use, prediction = alc$prediction) %>% prop.table %>% addmargins
##         prediction
## high_use      FALSE       TRUE        Sum
##    FALSE 0.67801047 0.02356021 0.70157068
##    TRUE  0.21989529 0.07853403 0.29842932
##    Sum   0.89790576 0.10209424 1.00000000

the logistic regression model m and dataset alc with predictions are available

define a loss function (average prediction error)

# Average prediction error: the share of observations whose predicted
# probability falls on the wrong side of 0.5 relative to the true class
# (class is 0/1 or FALSE/TRUE, prob is the predicted probability of 1).
loss_func <- function(class, prob) {
  # logical vector of misclassifications; mean() counts TRUE as 1
  mean(abs(class - prob) > 0.5)
}

call loss_func to compute the average number of wrong predictions in the (training) data

loss_func(class = alc$high_use, prob = alc$probability)
## [1] 0.2434555

the logistic regression model m and dataset alc (with predictions) are available

define a loss function (average prediction error)

# Loss function (average prediction error): a prediction is wrong when the
# predicted probability differs from the true 0/1 class by more than 0.5.
loss_func <- function(class, prob) {
  is_wrong <- abs(class - prob) > 0.5
  mean(is_wrong)
}

compute the average number of wrong predictions in the (training) data

loss_func(class = alc$high_use, prob = alc$probability)
## [1] 0.2434555

K-fold cross-validation

library(boot)
cv <- cv.glm(data = alc, cost = loss_func, glmfit = m, K = 10)

average number of wrong predictions in the cross validation

cv$delta[1]
## [1] 0.2591623

Exercise 4

access the MASS package

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select

load the data

data("Boston")

explore the dataset

str(Boston)
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
summary(Boston)
##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08204   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          black       
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
##      lstat            medv      
##  Min.   : 1.73   Min.   : 5.00  
##  1st Qu.: 6.95   1st Qu.:17.02  
##  Median :11.36   Median :21.20  
##  Mean   :12.65   Mean   :22.53  
##  3rd Qu.:16.95   3rd Qu.:25.00  
##  Max.   :37.97   Max.   :50.00
library(tidyr)
library(corrplot)
## corrplot 0.84 loaded
library(dplyr)

calculate the correlation matrix and round it

cor_matrix<-cor(Boston) %>% round(digits = 2 )

visualize the correlation matrix

corrplot(cor_matrix, method="circle", type="upper", cl.pos="b", tl.pos="d", tl.cex = 0.6)

It seems that “dis” is closely related to “age”, “nox”, “indus” and “zn”.

center and standardize variables

boston_scaled <- scale(Boston)

summaries of the scaled variables

summary(boston_scaled)
##       crim                 zn               indus        
##  Min.   :-0.419367   Min.   :-0.48724   Min.   :-1.5563  
##  1st Qu.:-0.410563   1st Qu.:-0.48724   1st Qu.:-0.8668  
##  Median :-0.390280   Median :-0.48724   Median :-0.2109  
##  Mean   : 0.000000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.007389   3rd Qu.: 0.04872   3rd Qu.: 1.0150  
##  Max.   : 9.924110   Max.   : 3.80047   Max.   : 2.4202  
##       chas              nox                rm               age         
##  Min.   :-0.2723   Min.   :-1.4644   Min.   :-3.8764   Min.   :-2.3331  
##  1st Qu.:-0.2723   1st Qu.:-0.9121   1st Qu.:-0.5681   1st Qu.:-0.8366  
##  Median :-0.2723   Median :-0.1441   Median :-0.1084   Median : 0.3171  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.:-0.2723   3rd Qu.: 0.5981   3rd Qu.: 0.4823   3rd Qu.: 0.9059  
##  Max.   : 3.6648   Max.   : 2.7296   Max.   : 3.5515   Max.   : 1.1164  
##       dis               rad               tax             ptratio       
##  Min.   :-1.2658   Min.   :-0.9819   Min.   :-1.3127   Min.   :-2.7047  
##  1st Qu.:-0.8049   1st Qu.:-0.6373   1st Qu.:-0.7668   1st Qu.:-0.4876  
##  Median :-0.2790   Median :-0.5225   Median :-0.4642   Median : 0.2746  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.6617   3rd Qu.: 1.6596   3rd Qu.: 1.5294   3rd Qu.: 0.8058  
##  Max.   : 3.9566   Max.   : 1.6596   Max.   : 1.7964   Max.   : 1.6372  
##      black             lstat              medv        
##  Min.   :-3.9033   Min.   :-1.5296   Min.   :-1.9063  
##  1st Qu.: 0.2049   1st Qu.:-0.7986   1st Qu.:-0.5989  
##  Median : 0.3808   Median :-0.1811   Median :-0.1449  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.4332   3rd Qu.: 0.6024   3rd Qu.: 0.2683  
##  Max.   : 0.4406   Max.   : 3.5453   Max.   : 2.9865

class of the boston_scaled object

class(boston_scaled)
## [1] "matrix"

change the object to data frame

boston_scaled <- as.data.frame(boston_scaled)

summary of the scaled crime rate

summary(boston_scaled$crim)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.419367 -0.410563 -0.390280  0.000000  0.007389  9.924110

create a quantile vector of crim and print it

bins <- quantile(boston_scaled$crim)
bins
##           0%          25%          50%          75%         100% 
## -0.419366929 -0.410563278 -0.390280295  0.007389247  9.924109610

create a categorical variable ‘crime’

crime <- cut(boston_scaled$crim, breaks = bins, include.lowest = TRUE, labels = c("low", "med_low", "med_high", "high"))

look at the table of the new factor crime

table(crime)
## crime
##      low  med_low med_high     high 
##      127      126      126      127

remove original crim from the dataset

boston_scaled <- dplyr::select(boston_scaled, -crim)

add the new categorical value to scaled data

boston_scaled <- data.frame(boston_scaled, crime)

number of rows in the Boston dataset

n <- nrow(Boston)

choose randomly 80% of the rows

ind <- sample(n,  size = n * 0.8)

create train set

train <- boston_scaled[ind,]

create test set

test <- boston_scaled[-ind,]

save the correct classes from test data

correct_classes <- test$crime

remove the crime variable from test data

test <- dplyr::select(test, -crime)

linear discriminant analysis

lda.fit <- lda(crime ~ ., data = train)

the function for lda biplot arrows

# Draw biplot-style arrows for an LDA fit onto an existing plot.
#   x           an lda fit; coef(x) gives one row of loadings per variable
#   myscale     multiplier applied to arrow lengths
#   arrow_heads arrow-head length, passed to arrows()
#   color, tex  arrow/label colour and label text size
#   choices     which two discriminants (columns of coef) to use as x/y
# Side effect only: adds arrows and labels to the current plot device.
lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "orange", tex = 0.75, choices = c(1,2)){
  heads <- coef(x)
  # one arrow per original variable, from the origin to its scaled loading
  arrows(x0 = 0, y0 = 0, 
         x1 = myscale * heads[,choices[1]], 
         y1 = myscale * heads[,choices[2]], col=color, length = arrow_heads)
  # variable names at the arrow tips (pos=3: above the point)
  text(myscale * heads[,choices], labels = row.names(heads), 
       cex = tex, col=color, pos=3)
}

target classes as numeric

classes <- as.numeric(train$crime)

plot the lda results

plot(lda.fit, dimen = 2, col = classes, pch = classes)
lda.arrows(lda.fit, myscale = 1)

predict classes with test data

lda.pred <- predict(lda.fit, newdata = test)

cross tabulate the results

table(correct = correct_classes, predicted = lda.pred$class)
##           predicted
## correct    low med_low med_high high
##   low       11       7        0    0
##   med_low    3      17        7    0
##   med_high   0       6       22    1
##   high       0       0        0   28

load MASS and Boston

library(MASS)
data('Boston')

center and standardize variables

boston_scaled <- scale(Boston)

change the object to data frame from matrix type.

boston_scaled <- as.data.frame(boston_scaled)

Calculate the Euclidean distances between observations.

dist_eu <- dist(boston_scaled)

look at the summary of the distances

summary(dist_eu)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1343  3.4625  4.8241  4.9111  6.1863 14.3970

K-means clustering

km <-kmeans(boston_scaled, centers = 3)

plot the Boston dataset with clusters

pairs(boston_scaled, col = km$cluster)

Boston dataset is available

set.seed(123)

determine the number of clusters

k_max <- 10

calculate the total within sum of squares

twcss <- sapply(1:k_max, function(k){kmeans(boston_scaled, k)$tot.withinss})

visualize the results

library(ggplot2)
qplot(x = 1:k_max, y = twcss, geom = 'line')

k-means clustering

km <-kmeans(boston_scaled, centers = 2)

plot the Boston dataset with clusters

pairs(boston_scaled, col = km$cluster)

ggpairs

library(GGally)
ggpairs(boston_scaled)

Super bonus

model_predictors <- dplyr::select(train, -crime)

check the dimensions

dim(model_predictors)
## [1] 404  13
dim(lda.fit$scaling)
## [1] 13  3

matrix multiplication

matrix_product <- as.matrix(model_predictors) %*% lda.fit$scaling
matrix_product <- as.data.frame(matrix_product)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

3D figure

plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type= 'scatter3d', mode='markers', color = train$crime )

First one needs to do k-means with 4 clusters to compare the methods.

km3D <-kmeans(boston_scaled, centers = 4)

3D figure

plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type= 'scatter3d', mode='markers', color = km3D$cluster[ind])

Exercise 5

human <- read.table("http://s3.amazonaws.com/assets.datacamp.com/production/course_2218/datasets/human2.txt", sep  =",", header = T)

Graphical overview of the data

str(human)
## 'data.frame':    155 obs. of  8 variables:
##  $ Edu2.FM  : num  1.007 0.997 0.983 0.989 0.969 ...
##  $ Labo.FM  : num  0.891 0.819 0.825 0.884 0.829 ...
##  $ Edu.Exp  : num  17.5 20.2 15.8 18.7 17.9 16.5 18.6 16.5 15.9 19.2 ...
##  $ Life.Exp : num  81.6 82.4 83 80.2 81.6 80.9 80.9 79.1 82 81.8 ...
##  $ GNI      : int  64992 42261 56431 44025 45435 43919 39568 52947 42155 32689 ...
##  $ Mat.Mor  : int  4 6 6 5 6 7 9 28 11 8 ...
##  $ Ado.Birth: num  7.8 12.1 1.9 5.1 6.2 3.8 8.2 31 14.5 25.3 ...
##  $ Parli.F  : num  39.6 30.5 28.5 38 36.9 36.9 19.9 19.4 28.2 31.4 ...
dim(human)
## [1] 155   8
summary(human)
##     Edu2.FM          Labo.FM          Edu.Exp         Life.Exp    
##  Min.   :0.1717   Min.   :0.1857   Min.   : 5.40   Min.   :49.00  
##  1st Qu.:0.7264   1st Qu.:0.5984   1st Qu.:11.25   1st Qu.:66.30  
##  Median :0.9375   Median :0.7535   Median :13.50   Median :74.20  
##  Mean   :0.8529   Mean   :0.7074   Mean   :13.18   Mean   :71.65  
##  3rd Qu.:0.9968   3rd Qu.:0.8535   3rd Qu.:15.20   3rd Qu.:77.25  
##  Max.   :1.4967   Max.   :1.0380   Max.   :20.20   Max.   :83.50  
##       GNI            Mat.Mor         Ado.Birth         Parli.F     
##  Min.   :   581   Min.   :   1.0   Min.   :  0.60   Min.   : 0.00  
##  1st Qu.:  4198   1st Qu.:  11.5   1st Qu.: 12.65   1st Qu.:12.40  
##  Median : 12040   Median :  49.0   Median : 33.60   Median :19.30  
##  Mean   : 17628   Mean   : 149.1   Mean   : 47.16   Mean   :20.91  
##  3rd Qu.: 24512   3rd Qu.: 190.0   3rd Qu.: 71.95   3rd Qu.:27.95  
##  Max.   :123124   Max.   :1100.0   Max.   :204.80   Max.   :57.50

# Load modelling and data-manipulation packages. Multiple calls on one line
# must be separated by semicolons — without them this line is a syntax error.
library(MASS); library(tidyr); library(dplyr)

To see structure and relationships

cor(human) 
##                Edu2.FM      Labo.FM     Edu.Exp   Life.Exp         GNI
## Edu2.FM    1.000000000  0.009564039  0.59325156  0.5760299  0.43030485
## Labo.FM    0.009564039  1.000000000  0.04732183 -0.1400125 -0.02173971
## Edu.Exp    0.593251562  0.047321827  1.00000000  0.7894392  0.62433940
## Life.Exp   0.576029853 -0.140012504  0.78943917  1.0000000  0.62666411
## GNI        0.430304846 -0.021739705  0.62433940  0.6266641  1.00000000
## Mat.Mor   -0.660931770  0.240461075 -0.73570257 -0.8571684 -0.49516234
## Ado.Birth -0.529418415  0.120158862 -0.70356489 -0.7291774 -0.55656208
## Parli.F    0.078635285  0.250232608  0.20608156  0.1700863  0.08920818
##              Mat.Mor  Ado.Birth     Parli.F
## Edu2.FM   -0.6609318 -0.5294184  0.07863528
## Labo.FM    0.2404611  0.1201589  0.25023261
## Edu.Exp   -0.7357026 -0.7035649  0.20608156
## Life.Exp  -0.8571684 -0.7291774  0.17008631
## GNI       -0.4951623 -0.5565621  0.08920818
## Mat.Mor    1.0000000  0.7586615 -0.08944000
## Ado.Birth  0.7586615  1.0000000 -0.07087810
## Parli.F   -0.0894400 -0.0708781  1.00000000
library(GGally)
library(ggplot2)
library(dplyr)
library(tidyr)
ggpairs(human)

library(ggplot2)
library(corrplot)
cor(human) %>% corrplot( method="circle", type="upper", title= "Graph 1. Correlations between variables in human data")

4 hypothesis 1) negative relationship between Edu2.FM and Mat.Mor 2) negative relationship between Edu.Exp and Mat.Mor 3) negative relationship between Life.Exp and Mat.Mor 4) negative relationship between GNI and Mat.Mor

Principal component analysis

pca_human <- prcomp(human)
pca_human
## Standard deviations (1, .., p=8):
## [1] 1.854416e+04 1.855219e+02 2.518701e+01 1.145441e+01 3.766241e+00
## [6] 1.565912e+00 1.912052e-01 1.591112e-01
## 
## Rotation (n x k) = (8 x 8):
##                     PC1           PC2           PC3           PC4
## Edu2.FM   -5.607472e-06  0.0006713951 -3.412027e-05 -2.736326e-04
## Labo.FM    2.331945e-07 -0.0002819357  5.302884e-04 -4.692578e-03
## Edu.Exp   -9.562910e-05  0.0075529759  1.427664e-02 -3.313505e-02
## Life.Exp  -2.815823e-04  0.0283150248  1.294971e-02 -6.752684e-02
## GNI       -9.999832e-01 -0.0057723054 -5.156742e-04  4.932889e-05
## Mat.Mor    5.655734e-03 -0.9916320120  1.260302e-01 -6.100534e-03
## Ado.Birth  1.233961e-03 -0.1255502723 -9.918113e-01  5.301595e-03
## Parli.F   -5.526460e-05  0.0032317269 -7.398331e-03 -9.971232e-01
##                     PC5           PC6           PC7           PC8
## Edu2.FM   -0.0022935252  2.180183e-02  6.998623e-01  7.139410e-01
## Labo.FM    0.0022190154  3.264423e-02  7.132267e-01 -7.001533e-01
## Edu.Exp    0.1431180282  9.882477e-01 -3.826887e-02  7.776451e-03
## Life.Exp   0.9865644425 -1.453515e-01  5.380452e-03  2.281723e-03
## GNI       -0.0001135863 -2.711698e-05 -8.075191e-07 -1.176762e-06
## Mat.Mor    0.0266373214  1.695203e-03  1.355518e-04  8.371934e-04
## Ado.Birth  0.0188618600  1.273198e-02 -8.641234e-05 -1.707885e-04
## Parli.F   -0.0716401914 -2.309896e-02 -2.642548e-03  2.680113e-03
# Biplot of the first two principal components (unscaled data); title typo
# corrected ("Grpagh" -> "Graph") for consistency with Graphs 3 and 4 below.
biplot(pca_human, choices = 1:2, cex = c(0.8, 1), col= c("grey40", "deeppink2"), main = "Graph 2. PCA with human data, unscaled")
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped

It is very hard to see

To standardize the variables in the human data

human.std <- scale(human)
pca_human2 <- prcomp(human.std)
biplot(pca_human2, choices= 1:2, cex = c(0.8, 1), col= c("grey40", "deeppink2"), main="Graph 3. PCA with human data, scaled")

Not clear so we will change it

s <- summary(pca_human2)
pca_pr <- round(100*s$importance[2, ], digits = 1)
pc_lab <- paste0(names(pca_pr), " (", pca_pr, "%)")
biplot(pca_human2, cex = c(0.8, 1), col = c("grey40", "deeppink2"), xlab = pc_lab[1], ylab = pc_lab[2], main="Graph 4. PCA with human data, scaled, %")

looks good but still hard to see

library(FactoMineR)
es.pca = PCA(human, scale.unit= TRUE, ncp=5)

Much better now

tea dataset

library(ggplot2)
library(dplyr)
library(tidyr)
library(FactoMineR)
data(tea)
my_tea <- data.frame(tea)
dim(my_tea)
## [1] 300  36
str(my_tea)
## 'data.frame':    300 obs. of  36 variables:
##  $ breakfast       : Factor w/ 2 levels "breakfast","Not.breakfast": 1 1 2 2 1 2 1 2 1 1 ...
##  $ tea.time        : Factor w/ 2 levels "Not.tea time",..: 1 1 2 1 1 1 2 2 2 1 ...
##  $ evening         : Factor w/ 2 levels "evening","Not.evening": 2 2 1 2 1 2 2 1 2 1 ...
##  $ lunch           : Factor w/ 2 levels "lunch","Not.lunch": 2 2 2 2 2 2 2 2 2 2 ...
##  $ dinner          : Factor w/ 2 levels "dinner","Not.dinner": 2 2 1 1 2 1 2 2 2 2 ...
##  $ always          : Factor w/ 2 levels "always","Not.always": 2 2 2 2 1 2 2 2 2 2 ...
##  $ home            : Factor w/ 2 levels "home","Not.home": 1 1 1 1 1 1 1 1 1 1 ...
##  $ work            : Factor w/ 2 levels "Not.work","work": 1 1 2 1 1 1 1 1 1 1 ...
##  $ tearoom         : Factor w/ 2 levels "Not.tearoom",..: 1 1 1 1 1 1 1 1 1 2 ...
##  $ friends         : Factor w/ 2 levels "friends","Not.friends": 2 2 1 2 2 2 1 2 2 2 ...
##  $ resto           : Factor w/ 2 levels "Not.resto","resto": 1 1 2 1 1 1 1 1 1 1 ...
##  $ pub             : Factor w/ 2 levels "Not.pub","pub": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Tea             : Factor w/ 3 levels "black","Earl Grey",..: 1 1 2 2 2 2 2 1 2 1 ...
##  $ How             : Factor w/ 4 levels "alone","lemon",..: 1 3 1 1 1 1 1 3 3 1 ...
##  $ sugar           : Factor w/ 2 levels "No.sugar","sugar": 2 1 1 2 1 1 1 1 1 1 ...
##  $ how             : Factor w/ 3 levels "tea bag","tea bag+unpackaged",..: 1 1 1 1 1 1 1 1 2 2 ...
##  $ where           : Factor w/ 3 levels "chain store",..: 1 1 1 1 1 1 1 1 2 2 ...
##  $ price           : Factor w/ 6 levels "p_branded","p_cheap",..: 4 6 6 6 6 3 6 6 5 5 ...
##  $ age             : int  39 45 47 23 48 21 37 36 40 37 ...
##  $ sex             : Factor w/ 2 levels "F","M": 2 1 1 2 2 2 2 1 2 2 ...
##  $ SPC             : Factor w/ 7 levels "employee","middle",..: 2 2 4 6 1 6 5 2 5 5 ...
##  $ Sport           : Factor w/ 2 levels "Not.sportsman",..: 2 2 2 1 2 2 2 2 2 1 ...
##  $ age_Q           : Factor w/ 5 levels "15-24","25-34",..: 3 4 4 1 4 1 3 3 3 3 ...
##  $ frequency       : Factor w/ 4 levels "1/day","1 to 2/week",..: 1 1 3 1 3 1 4 2 3 3 ...
##  $ escape.exoticism: Factor w/ 2 levels "escape-exoticism",..: 2 1 2 1 1 2 2 2 2 2 ...
##  $ spirituality    : Factor w/ 2 levels "Not.spirituality",..: 1 1 1 2 2 1 1 1 1 1 ...
##  $ healthy         : Factor w/ 2 levels "healthy","Not.healthy": 1 1 1 1 2 1 1 1 2 1 ...
##  $ diuretic        : Factor w/ 2 levels "diuretic","Not.diuretic": 2 1 1 2 1 2 2 2 2 1 ...
##  $ friendliness    : Factor w/ 2 levels "friendliness",..: 2 2 1 2 1 2 2 1 2 1 ...
##  $ iron.absorption : Factor w/ 2 levels "iron absorption",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ feminine        : Factor w/ 2 levels "feminine","Not.feminine": 2 2 2 2 2 2 2 1 2 2 ...
##  $ sophisticated   : Factor w/ 2 levels "Not.sophisticated",..: 1 1 1 2 1 1 1 2 2 1 ...
##  $ slimming        : Factor w/ 2 levels "No.slimming",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ exciting        : Factor w/ 2 levels "exciting","No.exciting": 2 1 2 2 2 2 2 2 2 2 ...
##  $ relaxing        : Factor w/ 2 levels "No.relaxing",..: 1 1 2 2 2 2 2 2 2 2 ...
##  $ effect.on.health: Factor w/ 2 levels "effect on health",..: 2 2 2 2 2 2 2 2 2 2 ...
summary(my_tea)
##          breakfast           tea.time          evening          lunch    
##  breakfast    :144   Not.tea time:131   evening    :103   lunch    : 44  
##  Not.breakfast:156   tea time    :169   Not.evening:197   Not.lunch:256  
##                                                                          
##                                                                          
##                                                                          
##                                                                          
##                                                                          
##         dinner           always          home           work    
##  dinner    : 21   always    :103   home    :291   Not.work:213  
##  Not.dinner:279   Not.always:197   Not.home:  9   work    : 87  
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##         tearoom           friends          resto          pub     
##  Not.tearoom:242   friends    :196   Not.resto:221   Not.pub:237  
##  tearoom    : 58   Not.friends:104   resto    : 79   pub    : 63  
##                                                                   
##                                                                   
##                                                                   
##                                                                   
##                                                                   
##         Tea         How           sugar                     how     
##  black    : 74   alone:195   No.sugar:155   tea bag           :170  
##  Earl Grey:193   lemon: 33   sugar   :145   tea bag+unpackaged: 94  
##  green    : 33   milk : 63                  unpackaged        : 36  
##                  other:  9                                          
##                                                                     
##                                                                     
##                                                                     
##                   where                 price          age        sex    
##  chain store         :192   p_branded      : 95   Min.   :15.00   F:178  
##  chain store+tea shop: 78   p_cheap        :  7   1st Qu.:23.00   M:122  
##  tea shop            : 30   p_private label: 21   Median :32.00          
##                             p_unknown      : 12   Mean   :37.05          
##                             p_upscale      : 53   3rd Qu.:48.00          
##                             p_variable     :112   Max.   :90.00          
##                                                                          
##            SPC               Sport       age_Q          frequency  
##  employee    :59   Not.sportsman:121   15-24:92   1/day      : 95  
##  middle      :40   sportsman    :179   25-34:69   1 to 2/week: 44  
##  non-worker  :64                       35-44:40   +2/day     :127  
##  other worker:20                       45-59:61   3 to 6/week: 34  
##  senior      :35                       +60  :38                    
##  student     :70                                                   
##  workman     :12                                                   
##              escape.exoticism           spirituality        healthy   
##  escape-exoticism    :142     Not.spirituality:206   healthy    :210  
##  Not.escape-exoticism:158     spirituality    : 94   Not.healthy: 90  
##                                                                       
##                                                                       
##                                                                       
##                                                                       
##                                                                       
##          diuretic             friendliness            iron.absorption
##  diuretic    :174   friendliness    :242   iron absorption    : 31   
##  Not.diuretic:126   Not.friendliness: 58   Not.iron absorption:269   
##                                                                      
##                                                                      
##                                                                      
##                                                                      
##                                                                      
##          feminine             sophisticated        slimming  
##  feminine    :129   Not.sophisticated: 85   No.slimming:255  
##  Not.feminine:171   sophisticated    :215   slimming   : 45  
##                                                              
##                                                              
##                                                              
##                                                              
##                                                              
##         exciting          relaxing              effect.on.health
##  exciting   :116   No.relaxing:113   effect on health   : 66    
##  No.exciting:184   relaxing   :187   No.effect on health:234    
##                                                                 
##                                                                 
##                                                                 
##                                                                 
## 
# Draw a bar plot of every variable in the tea data, one facet per variable.
# (gather() collapses all columns into key/value pairs; the factor-attribute
# warning it emits is expected.)
my_tea %>%
  gather() %>%
  ggplot(aes(value)) +
  geom_bar() +
  facet_wrap("key", scales = "free") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8))
## Warning: attributes are not identical across measure variables;
## they will be dropped

# Variables retained for the Multiple Correspondence Analysis
keep_columns <- c("breakfast", "sex", "price", "healthy", "spirituality", "friends", "Tea", "tearoom")
# Subset the tea data to just those columns
my_tea1 <- my_tea %>% dplyr::select(one_of(keep_columns))

# Bar plots of the selected variables only, one facet per variable
my_tea1 %>%
  gather() %>%
  ggplot(aes(value)) +
  geom_bar() +
  facet_wrap("key", scales = "free") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8))
## Warning: attributes are not identical across measure variables;
## they will be dropped

Multiple Correspondence Analysis

# Run Multiple Correspondence Analysis on the selected tea variables
# (MCA is presumably FactoMineR::MCA, loaded earlier in the document;
# graph = FALSE suppresses the default factor-map plots)
mca <- MCA(my_tea1, graph = FALSE)
# Eigenvalues plus coordinates/contributions of individuals and categories
summary(mca)
## 
## Call:
## MCA(X = my_tea1, graph = FALSE) 
## 
## 
## Eigenvalues
##                        Dim.1   Dim.2   Dim.3   Dim.4   Dim.5   Dim.6
## Variance               0.193   0.165   0.150   0.147   0.139   0.128
## % of var.             11.885  10.164   9.218   9.031   8.544   7.859
## Cumulative % of var.  11.885  22.050  31.268  40.299  48.843  56.702
##                        Dim.7   Dim.8   Dim.9  Dim.10  Dim.11  Dim.12
## Variance               0.121   0.115   0.110   0.102   0.096   0.087
## % of var.              7.429   7.054   6.784   6.297   5.928   5.371
## Cumulative % of var.  64.131  71.185  77.970  84.267  90.195  95.565
##                       Dim.13
## Variance               0.072
## % of var.              4.435
## Cumulative % of var. 100.000
## 
## Individuals (the 10 first)
##                    Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3
## 1               |  0.767  1.015  0.144 |  0.347  0.242  0.029 |  0.713
## 2               |  0.261  0.117  0.057 |  0.285  0.164  0.069 | -0.327
## 3               | -0.400  0.276  0.233 | -0.236  0.113  0.081 |  0.005
## 4               |  0.057  0.006  0.003 | -0.199  0.080  0.034 |  0.209
## 5               | -0.005  0.000  0.000 |  0.067  0.009  0.003 |  0.484
## 6               |  0.641  0.709  0.171 | -0.080  0.013  0.003 |  0.234
## 7               | -0.151  0.039  0.028 |  0.019  0.001  0.000 |  0.020
## 8               |  0.262  0.118  0.059 |  0.161  0.052  0.022 | -0.149
## 9               |  0.533  0.491  0.180 |  0.465  0.437  0.137 |  0.375
## 10              |  0.503  0.436  0.118 |  1.130  2.577  0.594 | -0.477
##                    ctr   cos2  
## 1                1.132  0.125 |
## 2                0.238  0.090 |
## 3                0.000  0.000 |
## 4                0.097  0.037 |
## 5                0.521  0.164 |
## 6                0.122  0.023 |
## 7                0.001  0.001 |
## 8                0.050  0.019 |
## 9                0.313  0.089 |
## 10               0.505  0.106 |
## 
## Categories (the 10 first)
##                     Dim.1     ctr    cos2  v.test     Dim.2     ctr
## breakfast       |  -0.002   0.000   0.000  -0.030 |   0.211   1.613
## Not.breakfast   |   0.002   0.000   0.000   0.030 |  -0.194   1.489
## F               |  -0.357   4.897   0.186  -7.458 |  -0.173   1.347
## M               |   0.521   7.144   0.186   7.458 |   0.253   1.965
## p_branded       |   0.289   1.708   0.039   3.398 |  -0.960  22.087
## p_cheap         |   0.678   0.694   0.011   1.812 |   0.329   0.191
## p_private label |   0.723   2.369   0.039   3.431 |   0.274   0.398
## p_unknown       |   0.192   0.095   0.002   0.677 |  -0.040   0.005
## p_upscale       |   0.563   3.621   0.068   4.507 |   1.183  18.709
## p_variable      |  -0.710  12.168   0.300  -9.471 |   0.187   0.986
##                    cos2  v.test     Dim.3     ctr    cos2  v.test  
## breakfast         0.041   3.500 |  -0.286   3.276   0.075  -4.751 |
## Not.breakfast     0.041  -3.500 |   0.264   3.024   0.075   4.751 |
## F                 0.044  -3.618 |  -0.242   2.905   0.086  -5.059 |
## M                 0.044   3.618 |   0.353   4.238   0.086   5.059 |
## p_branded         0.427 -11.300 |  -0.341   3.069   0.054  -4.012 |
## p_cheap           0.003   0.880 |   1.264   3.110   0.038   3.378 |
## p_private label   0.006   1.301 |   0.150   0.131   0.002   0.709 |
## p_unknown         0.000  -0.142 |   2.649  23.430   0.292   9.351 |
## p_upscale         0.300   9.475 |  -0.267   1.049   0.015  -2.137 |
## p_variable        0.021   2.493 |   0.024   0.019   0.000   0.326 |
## 
## Categorical variables (eta2)
##                   Dim.1 Dim.2 Dim.3  
## breakfast       | 0.000 0.041 0.075 |
## sex             | 0.186 0.044 0.086 |
## price           | 0.319 0.560 0.369 |
## healthy         | 0.010 0.044 0.413 |
## spirituality    | 0.082 0.019 0.000 |
## friends         | 0.412 0.000 0.000 |
## Tea             | 0.272 0.340 0.162 |
## tearoom         | 0.264 0.273 0.092 |
# MCA factor map showing only the variable categories (individuals hidden),
# coloured per variable via habillage = "quali"
plot(mca, invisible=c("ind"),  habillage="quali")

# Re-run the MCA with default graphical output (draws the factor maps).
# Fixed: use `<-` for assignment instead of `=`, per R style convention.
res.mca <- MCA(my_tea1)


Exercise 6

Read the BPRS data (the long form, BPRSL, is created from it below)

# Read the BPRS data (presumably Brief Psychiatric Rating Scale -- confirm
# with course material) in wide form: space-separated text with a header row
BPRS <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/MABS/master/Examples/data/BPRS.txt", sep=" ", header=TRUE)
# Structure: 40 rows with treatment, subject, and one column per week (week0..week8)
str(BPRS)
## 'data.frame':    40 obs. of  11 variables:
##  $ treatment: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ subject  : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ week0    : int  42 58 54 55 72 48 71 30 41 57 ...
##  $ week1    : int  36 68 55 77 75 43 61 36 43 51 ...
##  $ week2    : int  36 61 41 49 72 41 47 38 39 51 ...
##  $ week3    : int  43 55 38 54 65 38 30 38 35 55 ...
##  $ week4    : int  41 43 43 56 50 36 27 31 28 53 ...
##  $ week5    : int  40 34 28 50 39 29 40 26 22 43 ...
##  $ week6    : int  38 28 29 47 32 33 30 26 20 43 ...
##  $ week7    : int  47 28 25 42 38 27 31 25 23 39 ...
##  $ week8    : int  51 28 24 46 32 25 31 24 21 32 ...
# Column-wise summaries of the wide-form BPRS data
summary(BPRS)
##    treatment      subject          week0           week1      
##  Min.   :1.0   Min.   : 1.00   Min.   :24.00   Min.   :23.00  
##  1st Qu.:1.0   1st Qu.: 5.75   1st Qu.:38.00   1st Qu.:35.00  
##  Median :1.5   Median :10.50   Median :46.00   Median :41.00  
##  Mean   :1.5   Mean   :10.50   Mean   :48.00   Mean   :46.33  
##  3rd Qu.:2.0   3rd Qu.:15.25   3rd Qu.:58.25   3rd Qu.:54.25  
##  Max.   :2.0   Max.   :20.00   Max.   :78.00   Max.   :95.00  
##      week2          week3           week4           week5      
##  Min.   :26.0   Min.   :24.00   Min.   :20.00   Min.   :20.00  
##  1st Qu.:32.0   1st Qu.:29.75   1st Qu.:28.00   1st Qu.:26.00  
##  Median :38.0   Median :36.50   Median :34.50   Median :30.50  
##  Mean   :41.7   Mean   :39.15   Mean   :36.35   Mean   :32.55  
##  3rd Qu.:49.0   3rd Qu.:44.50   3rd Qu.:43.00   3rd Qu.:38.00  
##  Max.   :75.0   Max.   :76.00   Max.   :66.00   Max.   :64.00  
##      week6           week7          week8      
##  Min.   :19.00   Min.   :18.0   Min.   :20.00  
##  1st Qu.:22.75   1st Qu.:23.0   1st Qu.:22.75  
##  Median :28.50   Median :30.0   Median :28.00  
##  Mean   :31.23   Mean   :32.2   Mean   :31.43  
##  3rd Qu.:37.00   3rd Qu.:38.0   3rd Qu.:35.25  
##  Max.   :64.00   Max.   :62.0   Max.   :75.00

These data contain two treatment groups of subjects measured weekly (week0–week8)

library(dplyr)
library(tidyr)
library(ggplot2)

Convert the categorical variables of both sets to factors

# treatment and subject are categorical identifiers, not numbers:
# convert both to factors in one step before reshaping and plotting.
BPRS <- BPRS %>%
  mutate(treatment = as.factor(treatment),
         subject   = as.factor(subject))
# Confirm the conversion
str(BPRS)
## 'data.frame':    40 obs. of  11 variables:
##  $ treatment: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ subject  : Factor w/ 20 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ week0    : int  42 58 54 55 72 48 71 30 41 57 ...
##  $ week1    : int  36 68 55 77 75 43 61 36 43 51 ...
##  $ week2    : int  36 61 41 49 72 41 47 38 39 51 ...
##  $ week3    : int  43 55 38 54 65 38 30 38 35 55 ...
##  $ week4    : int  41 43 43 56 50 36 27 31 28 53 ...
##  $ week5    : int  40 34 28 50 39 29 40 26 22 43 ...
##  $ week6    : int  38 28 29 47 32 33 30 26 20 43 ...
##  $ week7    : int  47 28 25 42 38 27 31 25 23 39 ...
##  $ week8    : int  51 28 24 46 32 25 31 24 21 32 ...

Convert to long form

# Reshape BPRS to long form: one row per subject per week
BPRSL <-  BPRS %>% gather(key = weeks, value = bprs, -treatment, -subject)
# Parse the week number out of the column name ("week0" -> 0).
# sub() strips the "week" prefix and also handles multi-digit weeks;
# the original substr(weeks, 5, 5) only captured a single digit.
BPRSL <-  BPRSL %>% mutate(weeks = as.integer(sub("^week", "", weeks)))

Glimpse the long-form BPRSL data

# Long form: 360 rows (40 subjects x 9 weeks) and 4 columns
glimpse(BPRSL)
## Observations: 360
## Variables: 4
## $ treatment <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ subject   <fct> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ weeks     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ bprs      <int> 42, 58, 54, 55, 72, 48, 71, 30, 41, 57, 30, 55, 36, ...

Read RATS data

# Read the RATS data in wide form: tab-separated text with a header row.
# Columns WD1..WD64 hold repeated measurements per rat (weights, judging
# by how the long form names the value column below).
RATS <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/MABS/master/Examples/data/rats.txt", sep="\t", header=TRUE)
# Structure: 16 rats, ID/Group plus one column per measurement day
str(RATS)
## 'data.frame':    16 obs. of  13 variables:
##  $ ID   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Group: int  1 1 1 1 1 1 1 1 2 2 ...
##  $ WD1  : int  240 225 245 260 255 260 275 245 410 405 ...
##  $ WD8  : int  250 230 250 255 260 265 275 255 415 420 ...
##  $ WD15 : int  255 230 250 255 255 270 260 260 425 430 ...
##  $ WD22 : int  260 232 255 265 270 275 270 268 428 440 ...
##  $ WD29 : int  262 240 262 265 270 275 273 270 438 448 ...
##  $ WD36 : int  258 240 265 268 273 277 274 265 443 460 ...
##  $ WD43 : int  266 243 267 270 274 278 276 265 442 458 ...
##  $ WD44 : int  266 244 267 272 273 278 271 267 446 464 ...
##  $ WD50 : int  265 238 264 274 276 284 282 273 456 475 ...
##  $ WD57 : int  272 247 268 273 278 279 281 274 468 484 ...
##  $ WD64 : int  278 245 269 275 280 281 284 278 478 496 ...
# ID and Group are categorical labels; store them as factors.
RATS <- RATS %>%
  mutate(Group = as.factor(Group),
         ID    = as.factor(ID))
# Confirm the conversion
str(RATS)
## 'data.frame':    16 obs. of  13 variables:
##  $ ID   : Factor w/ 16 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Group: Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 2 2 ...
##  $ WD1  : int  240 225 245 260 255 260 275 245 410 405 ...
##  $ WD8  : int  250 230 250 255 260 265 275 255 415 420 ...
##  $ WD15 : int  255 230 250 255 255 270 260 260 425 430 ...
##  $ WD22 : int  260 232 255 265 270 275 270 268 428 440 ...
##  $ WD29 : int  262 240 262 265 270 275 273 270 438 448 ...
##  $ WD36 : int  258 240 265 268 273 277 274 265 443 460 ...
##  $ WD43 : int  266 243 267 270 274 278 276 265 442 458 ...
##  $ WD44 : int  266 244 267 272 273 278 271 267 446 464 ...
##  $ WD50 : int  265 238 264 274 276 284 282 273 456 475 ...
##  $ WD57 : int  272 247 268 273 278 279 281 274 468 484 ...
##  $ WD64 : int  278 245 269 275 280 281 284 278 478 496 ...

Convert to long form

# Reshape RATS to long form and parse the measurement day from the column
# name ("WD1" -> 1, "WD64" -> 64). sub() strips the "WD" prefix and is
# robust to any number of digits; the original substr(WDs, 3, 4) assumed
# the day number was at most two characters.
RATSL <-  RATS %>% gather(key = WDs, value = rats, -ID, -Group) %>%
  mutate(Time = as.integer(sub("^WD", "", WDs))) 

Glimpse the long-form RATSL data

# Long form: 176 rows (16 rats x 11 measurement days) and 5 columns
glimpse(RATSL)
## Observations: 176
## Variables: 5
## $ ID    <fct> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1...
## $ Group <fct> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 1, 1, 1,...
## $ WDs   <chr> "WD1", "WD1", "WD1", "WD1", "WD1", "WD1", "WD1", "WD1", ...
## $ rats  <int> 240, 225, 245, 260, 255, 260, 275, 245, 410, 405, 445, 5...
## $ Time  <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 8, 8,...
# Save the LONG-form data sets. Fixed: the original wrote the WIDE data
# frames (RATS, BPRS) into files named *_long.csv -- write the long
# versions (RATSL, BPRSL) instead. row.names = FALSE avoids a spurious
# unnamed index column when the files are read back.
write.csv(RATSL, "data/rats_long.csv", row.names = FALSE)
write.csv(BPRSL, "data/bprs_long.csv", row.names = FALSE)

Graphical Displays of Longitudinal Data

Draw the plot

# Individual bprs profiles over the weeks, one line per subject,
# with a separate panel per treatment group and no legend
ggplot(BPRSL, aes(x = weeks, y = bprs, linetype = subject)) +
  scale_linetype_manual(values = rep(1:10, times = 4)) +
  geom_line() +
  scale_y_continuous(limits = c(min(BPRSL$bprs), max(BPRSL$bprs))) +
  facet_grid(. ~ treatment, labeller = label_both) +
  theme(legend.position = "none")

Standardise the variable bprs

# Standardise bprs within each week: scale() centers by the weekly mean
# and divides by the weekly standard deviation, i.e. (x - mean(x)) / sd(x);
# as.numeric() drops the matrix wrapper scale() returns.
BPRSL <- BPRSL %>%
  group_by(weeks) %>%
  mutate(stdbprs = as.numeric(scale(bprs))) %>%
  ungroup()

Plot again with the standardised bprs

# Same profile plot on the standardised scale, so subjects can be
# compared within each week independent of the overall weekly level
ggplot(BPRSL, aes(x = weeks, y = stdbprs, linetype = subject)) +
  geom_line() +
  scale_linetype_manual(values = rep(1:10, times=4)) +
  facet_grid(. ~ treatment, labeller = label_both) +
  scale_y_continuous(name = "standardized bprs")

Summary Measure Analysis of Longitudinal Data

Number of weeks, baseline (week 0) included

# Number of measurement occasions (weeks 0..8, so 9); used below in the
# standard error denominator
n <- length(unique(BPRSL$weeks))

Summary data with mean and standard error of bprs by treatment and week

# Mean and standard error of bprs for each treatment x week cell.
# NOTE(review): se divides by sqrt(n) where n is the number of weeks (9),
# not the per-cell group size (20 subjects) -- kept as-is to match the
# course material, but worth confirming which denominator is intended.
BPRSS <- BPRSL %>%
  group_by(treatment, weeks) %>%
  summarise( mean = mean(bprs), se = sd(bprs)/sqrt(n) ) %>%
  ungroup()

Glimpse the data

# Summary data: 18 rows = 2 treatments x 9 weeks
glimpse(BPRSS)
## Observations: 18
## Variables: 4
## $ treatment <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2
## $ weeks     <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8
## $ mean      <dbl> 47.00, 46.80, 43.55, 40.90, 36.60, 32.70, 29.70, 29....
## $ se        <dbl> 4.534468, 5.173708, 4.003617, 3.744626, 3.259534, 2....

Mean response profiles for the two treatment groups in the BPRS data.

Plot the mean profiles

# Mean bprs profile per treatment group; points mark the weekly means
ggplot(BPRSS, aes(x = weeks, y = mean, linetype = treatment, shape = treatment)) +
  geom_line() +
  scale_linetype_manual(values = c(1,2)) +
  geom_point(size=3) +
  scale_shape_manual(values = c(1,2)) +
  # error bars intentionally left out of this version (kept for reference)
  #geom_errorbar(aes(ymin=mean-se, ymax=mean+se, linetype="1"), width=0.3) +
  theme(legend.position = c(0.8,0.8)) +
  scale_y_continuous(name = "mean(bprs) +/- se(bprs)")

# Rebuild RATSL from the wide data. NOTE(review): this OVERWRITES the
# earlier RATSL -- the value column is now `weight` (was `rats`) and the
# time column is lower-case `time` (was `Time`).
RATSL<-gather(RATS, key=WDs, value=weight, WD1:WD64)%>%mutate(time= as.integer(substr(WDs,3,4)))
# Individual weight profiles over time, one line per rat, one panel per group
ggplot(RATSL, aes(x = time, y = weight, linetype = ID)) +
  geom_line() +
  scale_linetype_manual(values = rep(1:10, times=4)) +
  facet_grid(. ~ Group, labeller = label_both) +
  theme(legend.position = "none") + 
  scale_y_continuous(limits = c(min(RATSL$weight), max(RATSL$weight)))

Standardise the weight

# Standardise weight within each time point: subtract the per-time mean
# and divide by the per-time standard deviation
RATSL <- RATSL %>%
  group_by(time) %>%
  mutate(stdweight = (weight - mean(weight)) / sd(weight)) %>%
  ungroup()
# Inspect the result
glimpse(RATSL)
## Observations: 176
## Variables: 6
## $ ID        <fct> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ Group     <fct> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 1, 1...
## $ WDs       <chr> "WD1", "WD1", "WD1", "WD1", "WD1", "WD1", "WD1", "WD...
## $ weight    <int> 240, 225, 245, 260, 255, 260, 275, 245, 410, 405, 44...
## $ time      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 8...
## $ stdweight <dbl> -1.0011429, -1.1203857, -0.9613953, -0.8421525, -0.8...
# Standardised weight profiles, built up layer by layer and then printed.
# The intermediate objects p1..p6 are kept (they exist in the original
# environment and are reused/reassigned later in the document).
p1 <- ggplot(RATSL, aes(x = time, y = stdweight, linetype = ID))
p2 <- p1 + geom_line() + scale_linetype_manual(values = rep(1:10, times = 4))
p3 <- p2 + facet_grid(. ~ Group, labeller = label_both)
p4 <- p3 + theme_bw() + theme(legend.position = "none")
p5 <- p4 + theme(panel.grid.minor.y = element_blank())
p6 <- p5 + scale_y_continuous(name = "standardized weights")
p6

Number of time points, baseline (day 1) included

# Number of measurement time points (11 days); used in the SE denominator
n <- length(unique(RATSL$time))

Make a summary data

# Mean weight and standard error per group x time cell.
# NOTE(review): as with BPRSS above, se divides by sqrt(number of time
# points), not the group size -- confirm which denominator is intended.
RATSS<-RATSL%>%group_by(Group, time)%>% summarise(mean=mean(weight), se=sd(weight)/sqrt(n))%>%ungroup()
# Summary data: 33 rows = 3 groups x 11 time points
glimpse(RATSS)
## Observations: 33
## Variables: 4
## $ Group <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,...
## $ time  <int> 1, 8, 15, 22, 29, 36, 43, 44, 50, 57, 64, 1, 8, 15, 22, ...
## $ mean  <dbl> 250.625, 255.000, 254.375, 261.875, 264.625, 265.000, 26...
## $ se    <dbl> 4.589478, 3.947710, 3.460116, 4.100800, 3.333956, 3.5529...

Visualize the mean weight and SE

# Mean weight profile per group with +/- SE error bars, built layer by layer
p1<-ggplot(RATSS, aes(x=time, y=mean, linetype=Group, shape=Group))
p2<-p1+geom_line()+scale_linetype_manual(values=c(1,2,3))
p3<-p2+geom_point(size=3)+scale_shape_manual(values=c(1,2,3))
# NOTE(review): linetype="1" inside aes() maps a constant string, which adds
# a spurious "1" entry to the linetype legend; a fixed linetype belongs
# outside aes(). Left unchanged to match the rendered output.
p4<-p3+geom_errorbar(aes(ymin=mean-se, ymax=mean+se, linetype="1"), width=0.3)
p5<-p4+theme_bw()+theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank())
p6<-p5+theme(legend.position= "top")
p7<-p6+scale_y_continuous(name="mean(weights) +/- se(weights)")
p7